Introduction

This report aims to provide a high-level review of our text datasets. These datasets are used to build a predictive text model, and the final model will be hosted on a Shiny server. The finished application can be accessed via a link and used without any coding. Before the modeling process, it is crucial to understand the data first.

Exploratory Data Analysis

This analysis aims to understand the size of each dataset, frequent words, and frequent word pairs.

Count Lines and Words

The table below shows the number of lines and words in each dataset.

library('dplyr')
library('tokenizers')
library('stopwords')
library('stringr')
library('plotly')
library('tm')
library('ggplot2')
library('DT')

# Load the three raw text corpora, one line per element.
con1 <- file("C:/Data/final/en_US/en_US.twitter.txt", "r") 
con2 <- file("C:/Data/final/en_US/en_US.news.txt", "r") 
con3 <- file("C:/Data/final/en_US/en_US.blogs.txt", "r") 

twitter <- readLines(con1)
news <- readLines(con2)
blog <- readLines(con3)

# Bug fix: the connections were opened but never released, leaking
# three file handles for the lifetime of the session.
close(con1)
close(con2)
close(con3)



##count words
## Count words in each corpus; a "word" is any run of \w characters.
word_cnt_twitter <- sum(str_count(twitter, pattern = '\\w+'))
word_cnt_news <- sum(str_count(news, pattern = '\\w+'))
word_cnt_blog <- sum(str_count(blog, pattern = '\\w+'))

# One row per corpus: line count and word count.
corpus_summary <- data.frame(
  'Dataset' = c('Twitter', 'News', 'Blog'),
  'Num_of_Lines' = c(length(twitter), length(news), length(blog)),
  'Num_of_Words' = c(word_cnt_twitter, word_cnt_news, word_cnt_blog)
)

# Render as an interactive table with thousands separators, no decimals.
DT::datatable(corpus_summary) %>%
  DT::formatRound(
    columns = c('Num_of_Lines', 'Num_of_Words'),
    interval = 3,
    digits = 0
  )

Frequent Words

Due to the large size of the datasets, I sample 2% of each one. The plots in this section show the top 10 most frequent words in each dataset.

#tokenize word
# Tokenize each corpus into lower-cased words, stripping punctuation,
# numbers, and English stopwords. Fix: use TRUE instead of T --
# T/F are ordinary variables in R and can be reassigned, so relying
# on them is a known footgun.
token_word_twitter <- tokenize_words(twitter,
                                     lowercase = TRUE,
                                     strip_punct = TRUE, 
                                     simplify = TRUE, 
                                     strip_numeric = TRUE,
                                     stopwords = stopwords('en'))
token_word_news <- tokenize_words(news,
                                  lowercase = TRUE,
                                  strip_punct = TRUE, 
                                  simplify = TRUE, 
                                  strip_numeric = TRUE,
                                  stopwords = stopwords('en'))
token_word_blog <- tokenize_words(blog,
                                  lowercase = TRUE,
                                  strip_punct = TRUE, 
                                  simplify = TRUE, 
                                  strip_numeric = TRUE,
                                  stopwords = stopwords('en'))

# create a function for sampling
# Return a sorted vector of element indices covering roughly `pct`
# of `df` (a list or vector).
#
# Fixes over the original:
#   * sample.int() draws indices WITHOUT replacement and uniformly;
#     round(runif(size, 1, upper)) could repeat indices and gave the
#     endpoints only half the probability of interior values.
#   * `<-` instead of `=` for assignment; no local named `list`
#     (which shadowed the base function).
#
# @param pct fraction of elements to sample, in [0, 1].
# @param df  list or vector to sample indices from.
# @return sorted integer vector of distinct indices in 1..length(df).
nlp_sample_list <- function(pct, df){
  n_total <- length(df)
  n_sample <- round(pct * n_total)
  sort(sample.int(n_total, n_sample))
}

# create tokenized word data frame based on 2% sample
# Sample 2% of the tokenized tweets and collect their words into one
# data frame. Building the column in a single unlist() pass produces
# exactly the same rows as the original loop, but avoids growing a
# data frame with bind_rows() inside a loop (quadratic copying).
set.seed(666)
twitter_idx <- nlp_sample_list(0.02, token_word_twitter)
tk_twitter_df <- data.frame(
  'Words' = unlist(token_word_twitter[twitter_idx], use.names = FALSE),
  stringsAsFactors = FALSE
)

# Sample 2% of the tokenized news lines and collect their words into
# one data frame. Same vectorized build as above: identical rows to
# the original bind_rows() loop without the quadratic copying.
set.seed(666)
news_idx <- nlp_sample_list(0.02, token_word_news)
tk_news_df <- data.frame(
  'Words' = unlist(token_word_news[news_idx], use.names = FALSE),
  stringsAsFactors = FALSE
)

# Sample 2% of the tokenized blog lines and collect their words into
# one data frame. Same vectorized build as above: identical rows to
# the original bind_rows() loop without the quadratic copying.
set.seed(666)
blog_idx <- nlp_sample_list(0.02, token_word_blog)
tk_blog_df <- data.frame(
  'Words' = unlist(token_word_blog[blog_idx], use.names = FALSE),
  stringsAsFactors = FALSE
)

# Bar chart of the 10 most frequent words in the Twitter sample.
tk_twitter_df %>%
  dplyr::group_by(Words) %>%
  dplyr::summarise(
    freq = n()
  ) %>%
  arrange(desc(freq)) %>%
  top_n(10, freq) %>%
  plot_ly(
    x = ~Words, 
    y = ~freq, 
    type = 'bar', text = ~freq, 
    textposition = 'auto',
    marker = list(
      color = '#5DADE2',
      line = list(color = 'rgb(8,48,107)', 
                  width = 1.5))) %>%
  layout(
      title = "Top 10 Frequent Word out of 2% in the Twitter Dataset",
      # Bug fix: this chart plots single words, not word pairs, so the
      # x-axis label was wrong.
      xaxis = list(title = 'Words',
                   categoryorder = 'total descending'),
      yaxis = list(
        title = 'Frequency')
    )
# Bar chart of the 10 most frequent words in the News sample.
tk_news_df %>%
  dplyr::group_by(Words) %>%
  dplyr::summarise(
    freq = n()
  ) %>%
  arrange(desc(freq)) %>%
  top_n(10, freq) %>%
  plot_ly(
    x = ~Words, 
    y = ~freq, 
    type = 'bar', text = ~freq, 
    textposition = 'auto',
    marker = list(
      color = '#5DADE2',
      line = list(color = 'rgb(8,48,107)', 
                  width = 1.5))) %>%
  layout(
    title = "Top 10 Frequent Word out of 2% in the News Dataset",
    # Bug fix: this chart plots single words, not word pairs, so the
    # x-axis label was wrong.
    xaxis = list(title = 'Words', 
                 categoryorder = 'total descending'),
    yaxis = list(title = 'Frequency'))
# Bar chart of the 10 most frequent words in the Blog sample.
# Note: the original re-filtered stopwords here, but stopwords are
# already removed during tokenization (stopwords = stopwords('en')),
# so the filter was a no-op and inconsistent with the Twitter/News
# charts; it has been dropped.
tk_blog_df %>%
  dplyr::group_by(Words) %>%
  dplyr::summarise(
    freq = n()
  ) %>%
  arrange(desc(freq)) %>%
  top_n(10, freq) %>%
  plot_ly(
    x = ~Words, 
    y = ~freq, 
    type = 'bar', text = ~freq, 
    textposition = 'auto',
    marker = list(
      color = '#5DADE2',
      line = list(color = 'rgb(8,48,107)', 
                  width = 1.5))) %>%
  layout(
    title = "Top 10 Frequent Word out of 2% in the Blogs Dataset",
    # Bug fix: this chart plots single words, not word pairs, so the
    # x-axis label was wrong.
    xaxis = list(title = 'Words', categoryorder = 'total descending'),
    yaxis = list(title = 'Frequency'))

Frequent 2-gram Word Pairs

The plots in this section show the top 10 most frequent 2-gram word pairs in each dataset.

#define function
# Frequency table of the top `topn` n-grams drawn from a `pct` sample
# of the character vector `df`.
#
# Fixes over the original:
#   * removed library() calls inside the function body (packages are
#     loaded once at the top of the script; attaching inside a
#     function is a side effect on global search-path state);
#   * `is.na(x) == F` replaced with `!is.na(x)`; TRUE instead of T;
#   * n-grams collected in one unlist() pass instead of growing a
#     data frame with bind_rows() in a loop (quadratic copying);
#   * local result no longer shadows the function's own name;
#   * implicit return of the last expression.
#
# @param n    n-gram size (e.g. 2 for bigrams).
# @param df   character vector of raw lines.
# @param pct  fraction of lines to sample, in [0, 1].
# @param topn number of most frequent n-grams to keep.
# @return data frame with columns Word_Pairs and freq, sorted by
#         descending frequency (ties may exceed topn rows, per top_n).
n_gram_freq <- function(n, df, pct, topn){
  # Lower-cased n-grams with English stopwords removed.
  n_gram <- tokenize_ngrams(df, 
                            n = n, 
                            n_min = n,
                            stopwords = stopwords("en"),
                            lowercase = TRUE, 
                            simplify = TRUE)
  
  # Collect the sampled n-grams in a single pass.
  idx <- nlp_sample_list(pct, n_gram)
  n_gram_df <- data.frame(
    'Word_Pairs' = unlist(n_gram[idx], use.names = FALSE),
    stringsAsFactors = FALSE
  )
  
  n_gram_df %>%
    dplyr::filter(!is.na(Word_Pairs)) %>%
    dplyr::group_by(Word_Pairs) %>%
    dplyr::summarise(freq = n()) %>%
    arrange(desc(freq)) %>%
    top_n(topn, freq) %>%
    # NOTE(review): 'â' patches a mojibake artifact (apostrophes read
    # under the wrong encoding); reading the files with the correct
    # encoding upstream would make this substitution unnecessary.
    dplyr::mutate(Word_Pairs = gsub('â', "'", Word_Pairs))
}

# 2-gram frequencies for the Twitter sample (2% of lines, top 10).
set.seed(666)
twitter_2n_gram <- n_gram_freq(2, twitter, 0.02, 10)
plot_ly(
  twitter_2n_gram,
  x = ~Word_Pairs,
  y = ~freq,
  type = 'bar',
  text = ~freq,
  textposition = 'auto',
  marker = list(
    color = '#5DADE2',
    line = list(color = 'rgb(8,48,107)', width = 1.5)
  )
) %>%
  layout(
    title = "Top 10 Frequent 2-gram Word Pairs out of 2% in the Twitter Dataset",
    xaxis = list(title = 'Word Pairs', categoryorder = 'total descending'),
    yaxis = list(title = 'Frequency')
  )
# 2-gram frequencies for the News sample (2% of lines, top 10).
set.seed(666)
news_2n_gram <- n_gram_freq(2, news, 0.02, 10)
plot_ly(
  news_2n_gram,
  x = ~Word_Pairs,
  y = ~freq,
  type = 'bar',
  text = ~freq,
  textposition = 'auto',
  marker = list(
    color = '#5DADE2',
    line = list(color = 'rgb(8,48,107)', width = 1.5)
  )
) %>%
  layout(
    title = "Top 10 Frequent 2-gram Word Pairs out of 2% in the News Dataset",
    xaxis = list(title = 'Word_Pairs', categoryorder = 'total descending'),
    yaxis = list(title = 'Frequency')
  )
# 2-gram frequencies for the Blog sample (2% of lines, top 10).
set.seed(666)
blog_2n_gram <- n_gram_freq(2, blog, 0.02, 10)
plot_ly(
  blog_2n_gram,
  x = ~Word_Pairs,
  y = ~freq,
  type = 'bar',
  text = ~freq,
  textposition = 'auto',
  marker = list(
    color = '#5DADE2',
    line = list(color = 'rgb(8,48,107)', width = 1.5)
  )
) %>%
  layout(
    title = "Top 10 Frequent 2-gram Word Pairs out of 2% in the Blog Dataset",
    xaxis = list(title = 'Word Pairs', categoryorder = 'total descending'),
    yaxis = list(title = 'Frequency')
  )

Frequent 3-gram Word Pairs

The plots in this section show the top 10 most frequent 3-gram word pairs in each dataset.

# 3-gram frequencies for the Twitter sample (2% of lines, top 10).
set.seed(666)
twitter_3n_gram <- n_gram_freq(3, twitter, 0.02, 10)
plot_ly(
  twitter_3n_gram,
  x = ~Word_Pairs,
  y = ~freq,
  type = 'bar',
  text = ~freq,
  textposition = 'auto',
  marker = list(
    color = '#5DADE2',
    line = list(color = 'rgb(8,48,107)', width = 1.5)
  )
) %>%
  layout(
    title = "Top 10 Frequent 3-gram Word Pairs out of 2% in the Twitter Dataset",
    xaxis = list(title = 'Word Pairs', categoryorder = 'total descending'),
    yaxis = list(title = 'Frequency')
  )
# 3-gram frequencies for the News sample (2% of lines, top 10).
set.seed(666)
news_3n_gram <- n_gram_freq(3, news, 0.02, 10)
plot_ly(
  news_3n_gram,
  x = ~Word_Pairs,
  y = ~freq,
  type = 'bar',
  text = ~freq,
  textposition = 'auto',
  marker = list(
    color = '#5DADE2',
    line = list(color = 'rgb(8,48,107)', width = 1.5)
  )
) %>%
  layout(
    title = "Top 10 Frequent 3-gram Word Pairs out of 2% in the News Dataset",
    xaxis = list(title = 'Word Pairs', categoryorder = 'total descending'),
    yaxis = list(title = 'Frequency')
  )
# 3-gram frequencies for the Blog sample (2% of lines, top 10).
set.seed(666)
blog_3n_gram <- n_gram_freq(3, blog, 0.02, 10)
plot_ly(
  blog_3n_gram,
  x = ~Word_Pairs,
  y = ~freq,
  type = 'bar',
  text = ~freq,
  textposition = 'auto',
  marker = list(
    color = '#5DADE2',
    line = list(color = 'rgb(8,48,107)', width = 1.5)
  )
) %>%
  layout(
    title = "Top 10 Frequent 3-gram Word Pairs out of 2% in the Blog Dataset",
    xaxis = list(title = 'Word Pairs', categoryorder = 'total descending'),
    yaxis = list(title = 'Frequency')
  )